All pandas objects are value mutable but may not be size mutable. Lenght of a Series cannot be changed but columns can be inserted into a DataFrame.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Creating a Series Object
s =  pd.Series([1,2,3,np.nan, 'a', 4+5j])
s


Out[5]:
0         1
1         2
2         3
3       NaN
4         a
5    (4+5j)
dtype: object

In [8]:
# To generate dates
dates = pd.date_range('20130125', periods = 8)
dates


Out[8]:
DatetimeIndex(['2013-01-25', '2013-01-26', '2013-01-27', '2013-01-28',
               '2013-01-29', '2013-01-30', '2013-01-31', '2013-02-01'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Creating a DataFrame
dates = pd.date_range('20180101', periods = 6)
df = pd.DataFrame(np.random.rand(6,4), index = dates, columns = list('ABCD'))
df


Out[12]:
A B C D
2018-01-01 0.690510 0.290334 0.498668 0.724776
2018-01-02 0.928369 0.994264 0.808196 0.509202
2018-01-03 0.032481 0.079715 0.842365 0.575144
2018-01-04 0.605776 0.584509 0.324846 0.378899
2018-01-05 0.962642 0.736450 0.469180 0.705774
2018-01-06 0.016343 0.898750 0.483067 0.883672

In [15]:
# Creating a DataFrame from a dictionary
temp_dict = {'A':1,
            'B':pd.Series([2,3,4,5]),
            'C':np.array([3]*4, dtype='int32')}
df2 = pd.DataFrame(temp_dict)
df2


Out[15]:
A B C
0 1 2 3
1 1 3 3
2 1 4 3
3 1 5 3

In [16]:
# To get the dtype for every column
df2.dtypes


Out[16]:
A    int64
B    int64
C    int32
dtype: object

In [20]:
# To get the first x entries use pd.head(x) default = 5
df.head()


Out[20]:
A B C D
2018-01-01 0.690510 0.290334 0.498668 0.724776
2018-01-02 0.928369 0.994264 0.808196 0.509202
2018-01-03 0.032481 0.079715 0.842365 0.575144
2018-01-04 0.605776 0.584509 0.324846 0.378899
2018-01-05 0.962642 0.736450 0.469180 0.705774

In [21]:
# To view last x entries
df.tail(3)


Out[21]:
A B C D
2018-01-04 0.605776 0.584509 0.324846 0.378899
2018-01-05 0.962642 0.736450 0.469180 0.705774
2018-01-06 0.016343 0.898750 0.483067 0.883672

In [22]:
# To get the index of the dataframe
df.index


Out[22]:
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [23]:
# To get index of columns
df.columns


Out[23]:
Index(['A', 'B', 'C', 'D'], dtype='object')

In [24]:
# To get a quick statistic summary of a dataFrame
df.describe()


Out[24]:
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.539354 0.597337 0.571054 0.629578
std 0.421480 0.354993 0.206797 0.178732
min 0.016343 0.079715 0.324846 0.378899
25% 0.175805 0.363878 0.472652 0.525688
50% 0.648143 0.660480 0.490868 0.640459
75% 0.868904 0.858175 0.730814 0.720026
max 0.962642 0.994264 0.842365 0.883672

In [26]:
# To transpose your DataFrame
df.T


Out[26]:
2018-01-01 00:00:00 2018-01-02 00:00:00 2018-01-03 00:00:00 2018-01-04 00:00:00 2018-01-05 00:00:00 2018-01-06 00:00:00
A 0.690510 0.928369 0.032481 0.605776 0.962642 0.016343
B 0.290334 0.994264 0.079715 0.584509 0.736450 0.898750
C 0.498668 0.808196 0.842365 0.324846 0.469180 0.483067
D 0.724776 0.509202 0.575144 0.378899 0.705774 0.883672

In [29]:
# To sort by an axis

# axis 0 mean along the columns i.e. the dates got sorted in descending order
df.sort_index(axis=0, ascending=False)


Out[29]:
A B C D
2018-01-06 0.016343 0.898750 0.483067 0.883672
2018-01-05 0.962642 0.736450 0.469180 0.705774
2018-01-04 0.605776 0.584509 0.324846 0.378899
2018-01-03 0.032481 0.079715 0.842365 0.575144
2018-01-02 0.928369 0.994264 0.808196 0.509202
2018-01-01 0.690510 0.290334 0.498668 0.724776

In [31]:
# To arrange along the columns
df.sort_index(axis=1, ascending=False)


Out[31]:
D C B A
2018-01-01 0.724776 0.498668 0.290334 0.690510
2018-01-02 0.509202 0.808196 0.994264 0.928369
2018-01-03 0.575144 0.842365 0.079715 0.032481
2018-01-04 0.378899 0.324846 0.584509 0.605776
2018-01-05 0.705774 0.469180 0.736450 0.962642
2018-01-06 0.883672 0.483067 0.898750 0.016343

In [34]:
# To sort by specific values
df.sort_values(by='C', ascending=False)


Out[34]:
A B C D
2018-01-03 0.032481 0.079715 0.842365 0.575144
2018-01-02 0.928369 0.994264 0.808196 0.509202
2018-01-01 0.690510 0.290334 0.498668 0.724776
2018-01-06 0.016343 0.898750 0.483067 0.883672
2018-01-05 0.962642 0.736450 0.469180 0.705774
2018-01-04 0.605776 0.584509 0.324846 0.378899

In [35]:
# For selecting elements

In [36]:
# To select columns
df['A']


Out[36]:
2018-01-01    0.690510
2018-01-02    0.928369
2018-01-03    0.032481
2018-01-04    0.605776
2018-01-05    0.962642
2018-01-06    0.016343
Freq: D, Name: A, dtype: float64

In [37]:
df[0:3]


Out[37]:
A B C D
2018-01-01 0.690510 0.290334 0.498668 0.724776
2018-01-02 0.928369 0.994264 0.808196 0.509202
2018-01-03 0.032481 0.079715 0.842365 0.575144

In [39]:
df[0:4].A


Out[39]:
2018-01-01    0.690510
2018-01-02    0.928369
2018-01-03    0.032481
2018-01-04    0.605776
Freq: D, Name: A, dtype: float64

In [46]:
# To select by labels
dates


Out[46]:
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [51]:
# To select a row
df.loc['2018-01-01']


Out[51]:
A    0.690510
B    0.290334
C    0.498668
D    0.724776
Name: 2018-01-01 00:00:00, dtype: float64

In [53]:
# To select from multiple columns
df.loc[:, ['A', 'B']]


Out[53]:
A B
2018-01-01 0.690510 0.290334
2018-01-02 0.928369 0.994264
2018-01-03 0.032481 0.079715
2018-01-04 0.605776 0.584509
2018-01-05 0.962642 0.736450
2018-01-06 0.016343 0.898750

In [56]:
# To get the cell values
df.loc['2018-01-01', 'B']


Out[56]:
0.29033430143768058

In [63]:
# Use this method for fast access than the previous one
df.at[dates[0], 'B']   # Where dates[0] = row name


Out[63]:
0.29033430143768058

In [67]:
dates[0]


Out[67]:
Timestamp('2018-01-01 00:00:00', freq='D')

In [68]:
df


Out[68]:
A B C D
2018-01-01 0.690510 0.290334 0.498668 0.724776
2018-01-02 0.928369 0.994264 0.808196 0.509202
2018-01-03 0.032481 0.079715 0.842365 0.575144
2018-01-04 0.605776 0.584509 0.324846 0.378899
2018-01-05 0.962642 0.736450 0.469180 0.705774
2018-01-06 0.016343 0.898750 0.483067 0.883672

In [66]:
# Now to operate on DataFrame using indexing without the index names
df.iloc[3]


Out[66]:
A    0.605776
B    0.584509
C    0.324846
D    0.378899
Name: 2018-01-04 00:00:00, dtype: float64

In [69]:
# Here the first argument returns the number of rows and
# the second argument returns the number of columns
df.iloc[1:3 , 0:3]


Out[69]:
A B C
2018-01-02 0.928369 0.994264 0.808196
2018-01-03 0.032481 0.079715 0.842365

In [76]:
# To select specific columns
df.iloc[[1,2,4], :]


Out[76]:
A B C D
2018-01-02 0.928369 0.994264 0.808196 0.509202
2018-01-03 0.032481 0.079715 0.842365 0.575144
2018-01-05 0.962642 0.736450 0.469180 0.705774

In [71]:
df.iloc[0,1]


Out[71]:
0.29033430143768058

In [72]:
# To get fast access use
df.iat[0,1]


Out[72]:
0.29033430143768058

In [73]:
df.iat[0,1] = 1

In [74]:
df.at[dates[0], 'B'] = 2

In [75]:
df.iat[0,1]


Out[75]:
2.0

In [79]:
# to make DataFrames by conditioning
df[df.A > 0.5]


Out[79]:
A B C D
2018-01-01 0.690510 2.000000 0.498668 0.724776
2018-01-02 0.928369 0.994264 0.808196 0.509202
2018-01-04 0.605776 0.584509 0.324846 0.378899
2018-01-05 0.962642 0.736450 0.469180 0.705774

In [83]:
df[df > 0.5]


Out[83]:
A B C D
2018-01-01 0.690510 2.000000 NaN 0.724776
2018-01-02 0.928369 0.994264 0.808196 0.509202
2018-01-03 NaN NaN 0.842365 0.575144
2018-01-04 0.605776 0.584509 NaN NaN
2018-01-05 0.962642 0.736450 NaN 0.705774
2018-01-06 NaN 0.898750 NaN 0.883672

In [95]:
# To add new columns to DataFrame
df['E'] = ['one', 'one','two','three','four','three']
df


Out[95]:
A B C D F E
2018-01-01 0.690510 2.000000 NaN 0.724776 NaN one
2018-01-02 0.928369 0.994264 0.808196 0.509202 NaN one
2018-01-03 NaN NaN 0.842365 0.575144 NaN two
2018-01-04 0.605776 0.584509 NaN NaN NaN three
2018-01-05 0.962642 0.736450 NaN 0.705774 NaN four
2018-01-06 NaN 0.898750 NaN 0.883672 NaN three

In [87]:
df['F'] = pd.Series([1,2,3,4,5,6])
df


Out[87]:
A B C D E F
2018-01-01 0.690510 2.000000 0.498668 0.724776 one NaN
2018-01-02 0.928369 0.994264 0.808196 0.509202 one NaN
2018-01-03 0.032481 0.079715 0.842365 0.575144 two NaN
2018-01-04 0.605776 0.584509 0.324846 0.378899 three NaN
2018-01-05 0.962642 0.736450 0.469180 0.705774 four NaN
2018-01-06 0.016343 0.898750 0.483067 0.883672 three NaN

In [88]:
# To select specific rows with desired values
df[df['E'].isin(['two', 'three'])]


Out[88]:
A B C D E F
2018-01-03 0.032481 0.079715 0.842365 0.575144 two NaN
2018-01-04 0.605776 0.584509 0.324846 0.378899 three NaN
2018-01-06 0.016343 0.898750 0.483067 0.883672 three NaN

In [96]:
# Missing data
# By default represented by np.nan
df = df[df > 0.5]
df


Out[96]:
A B C D F E
2018-01-01 0.690510 2.000000 NaN 0.724776 NaN one
2018-01-02 0.928369 0.994264 0.808196 0.509202 NaN one
2018-01-03 NaN NaN 0.842365 0.575144 NaN two
2018-01-04 0.605776 0.584509 NaN NaN NaN three
2018-01-05 0.962642 0.736450 NaN 0.705774 NaN four
2018-01-06 NaN 0.898750 NaN 0.883672 NaN three

In [97]:
del df['E']
df


Out[97]:
A B C D F
2018-01-01 0.690510 2.000000 NaN 0.724776 NaN
2018-01-02 0.928369 0.994264 0.808196 0.509202 NaN
2018-01-03 NaN NaN 0.842365 0.575144 NaN
2018-01-04 0.605776 0.584509 NaN NaN NaN
2018-01-05 0.962642 0.736450 NaN 0.705774 NaN
2018-01-06 NaN 0.898750 NaN 0.883672 NaN

In [99]:
# To change index of DataFrame
df.reindex(index=['one', 'two', 'three', 'four', 'five', 'six'], 
           columns=list('ZYXWV'))


Out[99]:
Z Y X W V
one NaN NaN NaN NaN NaN
two NaN NaN NaN NaN NaN
three NaN NaN NaN NaN NaN
four NaN NaN NaN NaN NaN
five NaN NaN NaN NaN NaN
six NaN NaN NaN NaN NaN

In [101]:
# To drop any row having missing values
df.dropna(how='any')
# If the row contains a single Nan it would be dropped


Out[101]:
A B C D F

In [102]:
# All the entries in a row must by nan to get dropped
df.dropna(how='all')


Out[102]:
A B C D F
2018-01-01 0.690510 2.000000 NaN 0.724776 NaN
2018-01-02 0.928369 0.994264 0.808196 0.509202 NaN
2018-01-03 NaN NaN 0.842365 0.575144 NaN
2018-01-04 0.605776 0.584509 NaN NaN NaN
2018-01-05 0.962642 0.736450 NaN 0.705774 NaN
2018-01-06 NaN 0.898750 NaN 0.883672 NaN

In [104]:
# To replace Nan with desired values
df.fillna(value='Value Filled')


Out[104]:
A B C D F
2018-01-01 0.69051 2 Value Filled 0.724776 Value Filled
2018-01-02 0.928369 0.994264 0.808196 0.509202 Value Filled
2018-01-03 Value Filled Value Filled 0.842365 0.575144 Value Filled
2018-01-04 0.605776 0.584509 Value Filled Value Filled Value Filled
2018-01-05 0.962642 0.73645 Value Filled 0.705774 Value Filled
2018-01-06 Value Filled 0.89875 Value Filled 0.883672 Value Filled

In [106]:
# TO get boolean mask of the DataFrame
df.isna()
# False if not Nan and True if Nan


Out[106]:
A B C D F
2018-01-01 False False True False True
2018-01-02 False False False False True
2018-01-03 True True False False True
2018-01-04 False False True True True
2018-01-05 False False True False True
2018-01-06 True False True False True

In [107]:
# Various operations on a DataFrame

In [114]:
df = pd.DataFrame(10*np.random.rand(6,4),
                 index = pd.date_range('20180101', periods=6),
                 columns=list('ABCD'))
df


Out[114]:
A B C D
2018-01-01 8.917154 3.445613 5.064817 4.348126
2018-01-02 2.895181 8.083756 8.989853 8.996414
2018-01-03 2.440471 4.622014 7.497869 5.880346
2018-01-04 4.087635 7.287993 6.382507 6.802648
2018-01-05 3.427748 0.970122 1.782497 0.451094
2018-01-06 6.467148 9.609664 3.593324 4.125862

In [117]:
df.mean() # along axis = 0


Out[117]:
A    4.705890
B    5.669860
C    5.551811
D    5.100748
dtype: float64

In [118]:
# To apply some random function
df.apply(lambda x: x.max()-x.min())


Out[118]:
A    6.476684
B    8.639543
C    7.207357
D    8.545320
dtype: float64

In [120]:
# To get value counts
df['A'].value_counts()


Out[120]:
8.917154    1
4.087635    1
2.895181    1
2.440471    1
3.427748    1
6.467148    1
Name: A, dtype: int64

In [123]:
# To merge DataFrames use merge
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

print(left)
print(right)

pd.merge(left, right)


   key  lval
0  foo     1
1  foo     2
   key  rval
0  foo     4
1  foo     5
Out[123]:
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5